#IMPORT LIBRARIES
import sqlite3
import os
import pandas as pd
import numpy as np
import random
from numpy import linalg as LA
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift, DBSCAN, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from itertools import product
from math import ceil
from scipy.cluster.hierarchy import dendrogram
%matplotlib inline
#from pandas_profiling import ProfileReport
%config InlineBackend.figure_format = 'retina'
from scipy.stats import iqr as IQR
from collections import Counter
import scipy.stats as stat
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import ranking_metrics_at_k
from tqdm import tqdm
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
import warnings
warnings.filterwarnings('ignore')
# Set seaborn style
sns.set()
Collect initial data
#Import the raw retail transactions CSV into a dataframe
df = pd.read_csv('retail.csv')
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
#Show first 5 rows of data
df.head()
#Show last 5 rows of data
df.tail()
#Show info about the data (column dtypes and non-null counts)
df.info()
#Replace empty-string cells by NaN so they are counted as missing values
df.replace("", np.nan, inplace=True)
# Summary statistics for all variables
df.describe(include='all').transpose()
# Function to show missing values and their percentages
def missing_data(df):
    """Return the missing-value count and percentage for every column.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame with columns ['Feature', 'missing', 'percentage'],
    sorted by 'missing' in descending order.
    """
    # Vectorized null count per column (replaces the per-column Python loop)
    missing = df.isnull().sum()
    df_missing = pd.DataFrame({
        'Feature': missing.index,
        'missing': missing.values,
        'percentage': missing.values / df.shape[0] * 100,
    })
    return df_missing.sort_values('missing', axis=0, ascending=False)
#Check missing data (counts and percentages per column)
df_missing = missing_data(df)
df_missing = df_missing[df_missing['missing']>0]
df_missing
# Check duplicated observations (keep=False marks every copy of a duplicate)
df.loc[df.duplicated(keep=False), :]
# Create a dataframe copy so the raw data stays untouched
df1 = df.copy()
# Drop duplicated rows
df1.drop_duplicates(inplace=True)
# Select irregular stock code values (postage, fees, manual adjustments, gifts, ...)
irregular_stockcode = ['DCGS0076','DCGS0003','DCGS0070','DCGS0055', 'DCGS0072', 'DCGS0074', 'DCGS0069', 'DCGS0057',
                       'DCGSSBOY', 'DCGSSGIRL', 'DCGS0004', 'DCGS0073', 'DCGS0071', 'DCGS0068', 'DCGS0067', 'DCGS0066P',
                       'POST', 'D', 'DOT','M', 'BANK CHARGES', 'S', 'AMAZONFEE', 'gift_0001_40', 'gift_0001_50',
                       'gift_0001_30', 'gift_0001_20','gift_0001_10', 'PADS', 'B', 'CRUK', 'c2','C2','m']
# Remove records with irregular stock code values
df1 = df1[~ df1['StockCode'].isin(irregular_stockcode)]
#Remove transactions with price = 0 (free items / likely data errors)
df1 = df1[df1['UnitPrice']>0]
print('Percentage of data kept after removing irregularities:', np.round(df1.shape[0] / df.shape[0], 4))
# 'Unspecified' will be kept because it does not affect the collaborative filtering algorithm
df1['Country'].value_counts()
#Remove duplicate descriptions for the same StockCode. The remaining ones are very
#similar, so it does not matter which one is dropped.
#Create a new dataframe with the columns StockCode and Description
df_stock_desc = df1[['StockCode','Description']]
#Drop duplicates based on the StockCode column, keeping the last description seen
df_stock_desc = df_stock_desc.drop_duplicates(subset='StockCode', keep="last")
#Create a copy of df1
df2 = df1.copy()
#Merge df2 with df_stock_desc so each StockCode carries exactly one description
df3 = pd.merge(df2,df_stock_desc,on = 'StockCode', how = 'left')
#Check the results
df3
#Sanity check - after removing duplicates on StockCode + Description we should be left
#with one row per unique StockCode of the original dataset (3914 rows).
df4 = df3[['StockCode','Description_y']]
df4.drop_duplicates(inplace = True)
df4
#Drop the pre-merge column with the duplicated descriptions
df3.drop(['Description_x'], axis=1, inplace=True)
#Rename the surviving column back to 'Description'
df3.rename(columns = {'Description_y':'Description'}, inplace = True)
#Copy the results back into df1
df1 = df3.copy()
df1
# Check missing data again after the merge
df_missing = missing_data(df1)
df_missing = df_missing[df_missing['missing']>0]
df_missing
# Change 'InvoiceDate' to datetime and add a year-month period column
df1['InvoiceDate'] = df1['InvoiceDate'].astype('datetime64[ns]')
df1['Month_Year'] = df1['InvoiceDate'].dt.to_period('M')
df1
# Keep a copy without missing values for the recommendation system
df_rs = df1.dropna().copy()
# Get max CustomerID so synthetic ids do not collide with real ones
idMax = df1['CustomerID'].max()
# Get distinct invoice numbers for records where CustomerID is null
invoicesNullCustomer = df1[df1['CustomerID'].isnull()]['InvoiceNo'].unique()
# Fill the records where the customer id is null: all rows sharing the same
# invoice number get one new synthetic customer id
next_customerId = idMax+1
for invoiceNo in invoicesNullCustomer:
    df1.loc[df1['InvoiceNo'] == invoiceNo, 'CustomerID'] = next_customerId
    next_customerId +=1
# Check missing data - CustomerID should no longer be missing
df_missing = missing_data(df1)
df_missing = df_missing[df_missing['missing']>0]
df_missing
# Show data info
df1.info()
# Dataframe basic value stats
df1.describe(include='all').transpose()
#FINDING THE SALES PROPORTION OF EACH PRODUCT
#Create a grouped dataframe by stock code and description with quantity summed
selling_prods = pd.DataFrame(df1.groupby(['Description','StockCode']).sum()['Quantity'])
#Keep only products with quantity sold higher than 0
selling_prods = selling_prods[selling_prods['Quantity'] > 0]
#Order the dataset by highest quantity first
selling_prods.sort_values(by=['Quantity'], inplace= True, ascending=False)
#Percentage of items sold of each product over the total
selling_prods['percentage'] = round(selling_prods.Quantity /selling_prods.Quantity.sum() *100,3)
#Cumulative percentage of items sold
selling_prods['cumpercentage'] = selling_prods['percentage'].cumsum()
selling_prods.head(50)
It's possible to see that the sales are distributed across many products; the highest individual proportion is only about 1.07%.
# Plot the 10 best-selling products by quantity sold
best_selling_prods = selling_prods.head(10)
best_selling_prods.reset_index(level=0, inplace=True)
top10products=best_selling_prods.plot(x='Description', y= 'Quantity',rot=80, kind='bar')
top10products.set_ylabel('Products Sold')
plt.show()
#Find the proportion of sales by country
#Create a grouped dataframe by country
selling_country = pd.DataFrame(df1.groupby(['Country']).sum()['Quantity'])
#Order the dataset by the highest quantity
selling_country.sort_values(by=['Quantity'], inplace= True, ascending=False)
#Percentage of quantity sold in each country over the total
selling_country['percentage'] = round(selling_country.Quantity /selling_country.Quantity.sum() *100,3)
#Create a cumulative % of items sold
#selling_prods['cumpercentage'] = selling_prods['percentage'].cumsum()
selling_country.reset_index(inplace = True)
selling_country
#Define the labels and the data for the pie chart
labels =selling_country['Country']
values = selling_country['percentage']
data = dict(type='pie',labels=labels,
            values=values,)
#Define the layout
layout = dict(title=dict(text='Volume by Country - % share'))
#Create the figure
fig = go.Figure(data= data, layout=layout)
fig.update_traces(textposition='inside', textinfo='percent+label')
#fig.update_layout(showlegend=False, uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()
#FINDING THE MOST SELLING PRODUCT BY COUNTRY
selling_prods_country = df1.groupby(['Country', 'StockCode','Description'])['Quantity'].agg('sum').reset_index()
# NOTE(review): groupby().max() is taken column-wise per country, so the StockCode,
# Description and Quantity shown may come from DIFFERENT rows - verify this is the
# intended "most sold product" lookup
max_selling_prods_country = selling_prods_country.groupby("Country").max()
max_selling_prods_country
#Distribution of the volume sold by month
by_month=df1.groupby(['Month_Year'])['Quantity'].agg('sum')
by_month.plot()
#Show data
by_month
#Keep one row per invoice (drop repeated lines of the same invoice)
number_orders = df1.drop_duplicates(subset='InvoiceNo', keep="last")
#Count invoices per customer id
number_orders = number_orders['CustomerID'].value_counts()
#Sort so the customer with the highest number of invoices comes first
number_orders = number_orders.sort_values(ascending=False,axis=0)
#Reset index
number_orders = number_orders.reset_index()
#Rename columns
number_orders.rename(columns={'index': 'customer_id'}, inplace = True)
number_orders.rename(columns={'CustomerID': 'number_invoices'}, inplace = True)
#Group: how many customers placed each number of invoices
number_orders= number_orders.groupby("number_invoices")['customer_id'].agg(['count'])
#Reset index
number_orders = number_orders.reset_index()
#Show
number_orders
#Create a Plotly visualization to be included in a dashboard - line plot
number_orders5 = number_orders[number_orders['count']>5]
#Define the dataset to be used
# NOTE(review): this rebinds the global name `df` (previously the raw dataset) -
# confirm the raw dataframe is no longer needed after this point
df = number_orders5
#Define the data to plot
data = dict(type='scatter',
            x=df['number_invoices'],
            y=df['count'],
            name='Quantity of customers by quantity of invoices'
            )
#Define the layout
layout = dict(title=dict(text='Quantity of invoices'),
              xaxis=dict(title='Number of invoices'),
              yaxis=dict(title='Quantity of customers')
              )
#Create the figure
fig_users = go.Figure(data, layout)
fig_users.show()
#CANCELLED ITEMS (negative quantities represent cancellations)
cancelitems= df1.loc[df1['Quantity'] < 0]
cancelitems
#MOST CANCELLED ITEMS
cancelitemsprod=cancelitems.groupby(['StockCode','Description']).sum()['Quantity'].reset_index()
#Flip the sign so quantities read as positive cancelled counts
cancelitemsprod['Quantity']=cancelitemsprod['Quantity']*-1
cancelitemsprod = cancelitemsprod.sort_values(by='Quantity', ascending=False)
cancelitemsprod
#Plot the 10 most cancelled products
top10cancelitems=cancelitemsprod.head(10)
top10cancelitems=top10cancelitems.plot(x='Description', y='Quantity', rot=80, kind='bar')
top10cancelitems.set_title('Top 10 products cancelled')
top10cancelitems.set_xlabel('ProductID')
top10cancelitems.set_ylabel('Quantity of cancelled products')
#FINDING THE MOST COMMONLY CANCELLED PRODUCT BY COUNTRY
#(min of the still-negative sums = largest cancelled volume)
cancelcountry=cancelitems.groupby(['Country','StockCode'])['Quantity'].agg('sum').reset_index()
min_cancelcountry=cancelcountry.groupby(['Country']).min()
min_cancelcountry['Quantity']=min_cancelcountry['Quantity']*-1
min_cancelcountry
#Check info about the recommendation-system dataframe
df_rs.info()
# Reduce sparsity by excluding users and items without significant event history
def _interaction_stats(df):
    """Return (n_users, n_items, sparsity %) for an event dataframe."""
    n_users = df['CustomerID'].nunique()
    n_items = df['StockCode'].nunique()
    sparsity = float(df.shape[0]) / float(n_users * n_items) * 100
    return n_users, n_items, sparsity

def threshold_ratings(df, uid_min, iid_min, max_iter=None):
    """Removes users and items with less than uid_min and iid_min event occurrences, respectively.

    Filtering alternates between users and items until no more rows are removed
    (or max_iter iterations have run), because dropping infrequent users can push
    items below iid_min and vice versa.
    Credits: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
    """
    n_users, n_items, sparsity = _interaction_stats(df)
    print('Raw dataset info \n-----------------')
    print('Number of users: {}'.format(n_users))
    print('Number of items: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    done, i = False, 0
    while not done:
        starting_shape = df.shape[0]  # number of existing events
        # Keep only events of users with frequency >= uid_min
        uid_counts = df.groupby('CustomerID').size()
        df = df[~df['CustomerID'].isin(uid_counts[uid_counts < uid_min].index.tolist())]
        # Keep only events of items with frequency >= iid_min
        iid_counts = df.groupby('StockCode').size()
        df = df[~df['StockCode'].isin(iid_counts[iid_counts < iid_min].index.tolist())]
        ending_shape = df.shape[0]  # number of events after both filters
        i += 1
        if starting_shape == ending_shape or i == max_iter:  # convergence reached
            done = True
    if not max_iter:
        # Without an iteration cap, convergence guarantees both thresholds hold
        assert(df.groupby('CustomerID').size().min() >= uid_min)
        assert(df.groupby('StockCode').size().min() >= iid_min)
    n_users, n_items, sparsity = _interaction_stats(df)
    print('Limited dataset info \n-----------------')
    print('Number of iterations until convergence: {}'.format(i))
    print('Number of users: {}'.format(n_users))
    print('Number of items: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    return df
# Get the limited (densified) dataset: users and items with at least 5 events each
df_limited = threshold_ratings(df_rs, 5, 5)
# Create dataframe grouped by user and product, summing the quantities.
# Instead of an explicit rating, a larger purchased quantity carries more
# weight (implicit feedback) in the matrix.
grouped_df = df_limited.groupby(['CustomerID', 'StockCode'])['Quantity'].sum()
grouped_df = pd.DataFrame(grouped_df)
grouped_df.reset_index(inplace=True)
# Show dataframe
grouped_df
# Remove rows where the net quantity is not positive (fully cancelled purchases)
grouped_df = grouped_df[grouped_df['Quantity'] > 0]
# Show dataframe
grouped_df
# Change column types
grouped_df['Quantity'] = grouped_df['Quantity'].astype(float)
grouped_df['CustomerID'] = grouped_df['CustomerID'].astype("category")
grouped_df['StockCode'] = grouped_df['StockCode'].astype("category")
# Create two dense integer code columns to serve as sparse-matrix indices
grouped_df['Customer_ID'] = grouped_df['CustomerID'].cat.codes
grouped_df['Stock_Code'] = grouped_df['StockCode'].cat.codes
# Show dataframe
grouped_df
# Create two matrices: item-user for fitting the model, user-item for recommendations
sparse_item_user = csr_matrix((grouped_df['Quantity'], (grouped_df['Stock_Code'], grouped_df['Customer_ID'])))
sparse_user_item = csr_matrix((grouped_df['Quantity'], (grouped_df['Customer_ID'], grouped_df['Stock_Code'])))
Create a function that takes in the original user-item matrix and "masks" a percentage of the original interactions where a user-item interaction has taken place, for use as a test set. The test set will contain all of the original ratings, while the training set replaces the specified percentage of them with a zero in the original ratings matrix.
# Function to create the training and testing sets
def make_train(scores, pct_test = 0.2):
    """Mask a fraction of the known interactions to build a training set.

    The test set keeps every original interaction as a binary preference
    matrix, while the training set has pct_test of the interactions set
    back to zero (and pruned from sparse storage).

    Returns (training_set, test_set, list_of_altered_user_indices).
    """
    # Binary ground-truth copy: every observed interaction becomes a 1
    test_set = scores.copy()
    test_set[test_set != 0] = 1
    # Mutable copy that will have a fraction of its entries masked
    training_set = scores.copy()
    # All (item, user) coordinates where an interaction exists
    rows, cols = training_set.nonzero()
    interaction_coords = list(zip(rows, cols))
    # Fixed seed so the mask is reproducible
    random.seed(0)
    # Number of interactions to hide, rounded up
    n_masked = int(np.ceil(pct_test * len(interaction_coords)))
    # Draw that many coordinates without replacement
    masked = random.sample(interaction_coords, n_masked)
    item_rows = [coord[0] for coord in masked]
    user_cols = [coord[1] for coord in masked]
    # Zero out the chosen interactions and drop the explicit zeros
    training_set[item_rows, user_cols] = 0
    training_set.eliminate_zeros()
    return training_set, test_set, list(set(user_cols))
# Apply the train/test split function and store the results
items_train, items_test, item_user_altered = make_train(sparse_item_user, pct_test = 0.2)
# Initialize the Alternating Least Squares (ALS) model with initial parameters
als_model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
# Fit the model using the training sparse item-user matrix.
# alpha scales the raw quantities into implicit-feedback confidence values.
alpha = 15
data = (items_train * alpha).astype('double')
als_model.fit(data)
# Initialize the Bayesian Personalized Ranking (BPR) model with initial parameters
bpr_model = BayesianPersonalizedRanking(factors=20, regularization=0.1, iterations=50)
# Fit the model using the training sparse item-user matrix
alpha = 15
data = (items_train * alpha).astype('double')
bpr_model.fit(data)
# Initialize the Logistic Matrix Factorization (LMF) model with initial parameters
lmf_model = LogisticMatrixFactorization(factors=20, regularization=0.1, iterations=50)
# Fit the model using the training sparse item-user matrix
alpha = 15
data = (items_train * alpha).astype('double')
lmf_model.fit(data)
# Get the trained user and item vectors and convert them to csr matrices
#ALS model
user_vecs_ALS = sparse.csr_matrix(als_model.user_factors)
item_vecs_ALS = sparse.csr_matrix(als_model.item_factors)
#BPR model
user_vecs_BPR = sparse.csr_matrix(bpr_model.user_factors)
item_vecs_BPR = sparse.csr_matrix(bpr_model.item_factors)
#LMF model
user_vecs_LMF = sparse.csr_matrix(lmf_model.user_factors)
item_vecs_LMF = sparse.csr_matrix(lmf_model.item_factors)
Check if the order of recommendations given for each user matches the items they ended up purchasing. The metric that will be used for this evaluation is the area under the Receiver Operating Characteristic (or ROC) curve. A greater area under the curve means that the system is recommending items that end up being purchased near the top of the list of recommended items.
# Helper to compute the area under the ROC curve (AUC) for one set of predictions
def auc_score(predictions, test):
    """Return the ROC AUC of `predictions` scored against binary labels `test`."""
    false_pos_rate, true_pos_rate, _ = roc_curve(test, predictions)
    return auc(false_pos_rate, true_pos_rate)
# Function to calculate the AUC for each user in the training set that had at
# least one item masked, compared against a most-popular-items baseline.
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    """Mean AUC of the model's rankings vs. a popularity baseline.

    Parameters:
        training_set: sparse item-user matrix with some interactions zeroed out
        altered_persons: user (column) indices that had interactions masked
        predictions: [user_vecs, item_vecs.T] factor matrices of the trained model
        test_set: original binarized item-user interaction matrix (ground truth)

    Returns:
        (model_auc, popularity_auc), both rounded to 3 decimal places.
    """
    # AUC for each user that had an item removed from the training set
    store_auc = []
    # Popularity-baseline AUC scores
    popularity_auc = []
    # Sum interactions per item (row) to measure item popularity
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1)
    content_vecs = predictions[1]
    # Iterate through each user that had an item altered
    for person in altered_persons:
        # Training-set column for this user
        training_column = training_set[:,person].toarray().reshape(-1)
        # Items with no observed training interaction (the ranking candidates -
        # this includes the masked ones)
        zero_inds = np.where(training_column == 0)
        # Predicted scores for those candidate items from the factor vectors
        person_vec = predictions[0][person,:]
        pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)
        # Ground-truth binary labels for the same candidate items
        actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)
        # Item popularity restricted to the same candidate items
        pop = pop_contents[zero_inds]
        # AUC of the model's ranking for this user
        store_auc.append(auc_score(pred, actual))
        # AUC if the candidates had simply been ranked by popularity
        popularity_auc.append(auc_score(pop, actual))
    # Left value: mean model AUC; right value: mean popularity-baseline AUC
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))
# Apply the evaluation function - ALS model (returns model AUC, popularity AUC)
calc_mean_auc(items_train, item_user_altered,[user_vecs_ALS, item_vecs_ALS.T], items_test)
# Apply the evaluation function - BPR model
calc_mean_auc(items_train, item_user_altered,[user_vecs_BPR, item_vecs_BPR.T], items_test)
# Apply the evaluation function - LMF model
calc_mean_auc(items_train, item_user_altered,[user_vecs_LMF, item_vecs_LMF.T], items_test)
# Plot the initial (pre-tuning) model AUC scores side by side
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
als_score, pop_score = calc_mean_auc(items_train, item_user_altered,[user_vecs_ALS, item_vecs_ALS.T], items_test)
pbr_score, pop_score = calc_mean_auc(items_train, item_user_altered,[user_vecs_BPR, item_vecs_BPR.T], items_test)
lmf_score, pop_score = calc_mean_auc(items_train, item_user_altered,[user_vecs_LMF, item_vecs_LMF.T], items_test )
models_names = ['ALS', 'BPR', 'LMF']
models_scores = [als_score, pbr_score, lmf_score]
ax.bar(models_names,models_scores)
plt.title("Recommendation models initial AUC scores")
plt.show()
# Hyper-parameter tuning: exhaustive grid search over implicit-MF settings
def tune_hp(recomendation_model, sparse_item_user_matrix):
    """Grid-search alpha / factors / regularization / iterations for a model class.

    Parameters:
        recomendation_model: an implicit model class (e.g. AlternatingLeastSquares)
        sparse_item_user_matrix: item-user csr matrix of interaction weights

    Returns:
        (best_score, best_params): the highest mean AUC found and the parameter
        dict that produced it.
    """
    best_score = 0
    best_params = {'alpha': 0, 'latent_factors': 0, 'regularizations': 0, 'iterations': 0}
    # Parameter grid
    alphas = [10, 15, 20, 25, 30, 35, 40]
    latent_factors = [10, 20, 30, 40, 50, 60]
    regularizations = [0.01, 0.05, 0.1, 0.5, 1.1, 1.5]
    iterations = [10, 20, 30, 40, 50, 60]
    # Build the training and validation sets once, outside the search loop
    items_train, items_test, item_user_altered = make_train(sparse_item_user_matrix, pct_test = 0.2)
    # Iterate over the full grid (itertools.product replaces four nested loops)
    for alpha, factor, regularization, iteration in product(alphas, latent_factors, regularizations, iterations):
        # Scale interactions into confidence values
        data = (items_train * alpha).astype('double')
        # Initialize and fit the candidate model
        model = recomendation_model(factors = factor, regularization = regularization, iterations = iteration)
        model.fit(data, show_progress = False)
        # Factor matrices as csr for the evaluation helper
        user_vecs = sparse.csr_matrix(model.user_factors)
        item_vecs = sparse.csr_matrix(model.item_factors)
        # Mean model AUC of this configuration
        score = calc_mean_auc(items_train, item_user_altered, [user_vecs, item_vecs.T], items_test)[0]
        # Keep the best configuration seen so far
        if score > best_score:
            best_params['alpha'] = alpha
            best_params['latent_factors'] = factor
            best_params['regularizations'] = regularization
            best_params['iterations'] = iteration
            best_score = score
    # BUG FIX: report and return best_score - the original printed and returned
    # `score`, i.e. the AUC of the LAST grid point, not of the best one
    print("Best score:", best_score)
    message = "Best parameters: alpha: {}, factors: {}, regularization: {}, iterations: {}"
    print(message.format(best_params['alpha'],
                         best_params['latent_factors'],
                         best_params['regularizations'],
                         best_params['iterations']))
    return best_score, best_params
As the cell above takes a long time to run, it will be kept as a markdown and the results displayed here.
For the Alternating Least Squares model, the optimum results found are:
As the cell above takes a long time to run, it will be kept as a markdown and the results displayed here.
For the Logistic Matrix Factorization, the optimum results found are:
As the cell above takes a long time to run, it will be kept as a markdown and the results displayed here.
For the Bayesian Personalized Ranking, the optimum results found are:
# Plot the final AUC results (best scores obtained after hyper-parameter tuning)
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
models_names = ['ALS', 'BPR', 'LMF']
# Best tuned AUC score reported for each model
models_scores = [81.7, 60.3, 73.1]
ax.bar(models_names,models_scores)
# BUG FIX: the title said "initial" but these are the post-tuning (final) scores
plt.title("Recommendation models final AUC scores")
plt.show()
# Train the final model with the optimum parameters found during tuning
# Get training and validation sets
items_train, items_test, item_user_altered = make_train(sparse_item_user, pct_test = 0.2)
# Initialize the Alternating Least Squares (ALS) model with the tuned parameters
model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=40)
# Fit the model using the training sparse item-user matrix
alpha = 15
data = (items_train * alpha).astype('double')
model.fit(data)
# Pick a product, find its most similar products and compare their descriptions
# Encoded stock code of a product (randomly selected)
item_id = 2706
# Number of similar products to find, plus one: the product itself is its own
# closest match and is skipped when printing below
n_similar = 11
# Get the customer and item vectors from the trained model
user_vecs = model.user_factors
item_vecs = model.item_factors
# Calculate the item vector norms
item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))
# Cosine-style similarity of every item against the chosen item
scores = item_vecs.dot(item_vecs[item_id]) /item_norms
# Get the indices of the top n_similar most similar items
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
# Create a list of (item index, score) tuples, most similar first
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])
# Get the product's StockCode from its encoded id
item_stock_code = grouped_df[grouped_df['Stock_Code'] == item_id]['StockCode'].unique()[0]
# Get the product's description
item_description = df1[df1['StockCode'] == item_stock_code]['Description'].unique().tolist()
# Print the description of each similar product (skip the item itself at index 0)
print("Most similar products to: ", item_description[0], ":\n")
for item in similar[1:]:
    idx, score = item
    # Map the encoded item index back to its stock code
    stock_code = grouped_df['StockCode'].loc[grouped_df['Stock_Code'] == idx].iloc[0]
    # Map the stock code to a product description
    similar_products =df1[df1['StockCode']==stock_code]['Description'].unique().tolist()[0]
    print(similar_products)
It is possible to see that the recommended similar products are very related to the original product.
# Function returning the top recommendations, based on the user/item factor
# vectors, restricted to items the given person has never interacted with
def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10):
    """Top-N recommendations for one user, excluding already-bought items.

    Items the user has interacted with are masked out, the remaining items are
    ranked by the min-max scaled dot product of the user and item factors, and
    the best num_items are returned.

    Returns a dataframe with columns ['Descriptions', 'score'].
    """
    # Build a mask: 1 for never-interacted items, 0 for items the user already has
    interaction_mask = sparse_user_item[user_id, :].toarray().reshape(-1) + 1
    interaction_mask[interaction_mask > 1] = 0
    # Raw preference scores from the factor vectors
    raw_scores = user_vecs[user_id, :].dot(item_vecs.T).toarray()
    # Scale the scores into [0, 1]
    min_max = MinMaxScaler()
    scaled_scores = min_max.fit_transform(raw_scores.reshape(-1, 1))[:, 0]
    # Already-interacted items drop to zero and cannot be recommended
    recommend_vector = interaction_mask * scaled_scores
    # Indices of the best num_items recommendations, best first
    top_item_idx = np.argsort(recommend_vector)[::-1][:num_items]
    descriptions = []
    scores = []
    for idx in top_item_idx:
        # Map the encoded item index back to its stock code ...
        stock_code = grouped_df['StockCode'].loc[grouped_df['Stock_Code'] == idx].iloc[0]
        # ... and the stock code to a human-readable description
        descriptions.append(df1[df1['StockCode'] == stock_code]['Description'].unique().tolist()[0])
        scores.append(recommend_vector[idx])
    # Assemble the result table
    return pd.DataFrame({'Descriptions': descriptions, 'score': scores})
# Get the trained user and item vectors and convert them to csr matrices
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)
# Create recommendations for the customer with encoded id = 1993 (randomly chosen)
user_id = 1993
# Make recommendations
recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)
# Print recommendations
print(recommendations)
# Check recommendations by comparing them to the 10 items this user interacted with most
most_bought_items = grouped_df[grouped_df['Customer_ID'] == user_id].sort_values(by=['Quantity'], ascending=False).head(10)
# Get item descriptions based on each StockCode
description_list = []
for index, row in most_bought_items.iterrows():
    code = str(row['StockCode'])
    description_list.append(df1[df1['StockCode'] == code]['Description'].unique()[0])
# Add item descriptions to the dataframe and keep the relevant columns
most_bought_items['Description'] = description_list
most_bought_items = most_bought_items[['Description', 'Quantity']]
# Show the 10 items this user has most interacted with
most_bought_items
As it is possible to notice, the most purchased products from this user and the recommended products have very related descriptions.
As new users do not yet have interactions for the recommendation system to know their personal preferences, and as the only data available in the dataset about users is for their countries, first-time customers will be recommended the products that are currently the most sold in their regions.
# Function to recommend the top 10 most recently sold products for a given
# country within a given number of past months
def recomendations_new_users(country, months):
    """Top-10 best-selling products in `country` over the last `months` months.

    Cold-start fallback: new users have no interaction history, so they are
    shown their region's current best sellers instead of personalized picks.
    """
    # Purchases made in the requested country
    country_sales = df1[df1['Country'] == country]
    # Month boundary: everything strictly after this period counts as recent
    cutoff_month = pd.Series(df1['Month_Year'].unique()).sort_values(ascending=False).iloc[months]
    country_sales = country_sales[country_sales['Month_Year'] > cutoff_month]
    # Ten best-selling stock codes within the recent window
    top_codes = country_sales.groupby(['StockCode'])['Quantity'].sum().sort_values(ascending=False)
    top_codes = top_codes.head(10).index.tolist()
    # Resolve each stock code to its product description
    product_names = []
    for code in top_codes:
        product_names.append(country_sales[country_sales['StockCode'] == code]['Description'].unique()[0])
    return product_names
# Recommend products to a new customer from the United Kingdom based on the
# most sold products from the last 2 months
recomendations_new_users('United Kingdom', 2)
Most of the recommended products are Christmas items, since the purchases made in the last two months of the dataframe are close to Christmas.